This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyr")
library("readr")
## dat_crime <- read.csv(unz("https://github.com/avennu01/three-amigos/blob/master/train.csv.zip","train.csv")) .. unable to direclty read data from GitHub
dat_crime <- read.csv("https://www.dropbox.com/s/kjkt5ndf3jkibq4/train.csv?raw=1")
table(dat_crime$Category)
##
## ARSON ASSAULT
## 1513 76876
## BAD CHECKS BRIBERY
## 406 289
## BURGLARY DISORDERLY CONDUCT
## 36755 4320
## DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC
## 2268 53971
## DRUNKENNESS EMBEZZLEMENT
## 4280 1166
## EXTORTION FAMILY OFFENSES
## 256 491
## FORGERY/COUNTERFEITING FRAUD
## 10609 16679
## GAMBLING KIDNAPPING
## 146 2341
## LARCENY/THEFT LIQUOR LAWS
## 174900 1903
## LOITERING MISSING PERSON
## 1225 25989
## NON-CRIMINAL OTHER OFFENSES
## 92304 126182
## PORNOGRAPHY/OBSCENE MAT PROSTITUTION
## 22 7484
## RECOVERED VEHICLE ROBBERY
## 3138 23000
## RUNAWAY SECONDARY CODES
## 1946 9985
## SEX OFFENSES FORCIBLE SEX OFFENSES NON FORCIBLE
## 4388 148
## STOLEN PROPERTY SUICIDE
## 4540 508
## SUSPICIOUS OCC TREA
## 31414 6
## TRESPASS VANDALISM
## 7326 44725
## VEHICLE THEFT WARRANTS
## 53781 42214
## WEAPON LAWS
## 8555
# seperating Dates in to Date & Time and selecting below categories only, rest have been eliminated based on group consensus since they dint have impact on our end objective
# Missing Person .. Need more deepdive
dat_crimeNew <- dat_crime %>% separate(col = Dates, into = c("Date","Time"), sep = " ",fill = "right" ) %>% filter(Category %in% c("ASSAULT","DISORDERLY CONDUCT","DRUNKENNESS","DRUG/NARCOTIC","KIDNAPPING","LARCENY/THEFT","LOITERING","MISSING PERSON","NON-CRIMINAL","ROBBERY","SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","STOLEN PROPERTY","SUSPICIOUS OCC","VANDALISM","VEHICLE THEFT"))
dat_occ <- dat_crimeNew %>% group_by(Category,Descript) %>% summarize(occurence= n())
# Based of above table, below considerations have been made
# Descripts to consider apart from one's selected .... ASSAULT, BATTERY,STALKING,LOITERING,
# complete categories to consider "KIDNAPPING","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE"
# complete categories to ingore "VANDALISM","VEHICLE THEFT"
dat_crimeNew <- dat_crime %>% separate(col = Dates, into = c("Date","Time"), sep = " ",fill = "right" ) %>% filter(Category %in% c("ASSAULT","DISORDERLY CONDUCT","DRUNKENNESS","DRUG/NARCOTIC","KIDNAPPING","LARCENY/THEFT","LOITERING","MISSING PERSON","NON-CRIMINAL","ROBBERY","SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","STOLEN PROPERTY","SUSPICIOUS OCC"))
dat_occ <- dat_crimeNew %>% group_by(Category,Descript) %>% summarize(occurence= n())
p <- dat_occ %>% spread(Descript,occurence)
final_desc <- c("AGGRAVATED ASSAULT WITH","ASSAULT, AGGRAVATED, W/","ATTEMPTED HOMICIDE WITH","ATTEMPTED MAYHEM WITH","ATTEMPTED SIMPLE","BATTERY WITH","MAYHEM WITH","THREATS AGAINST","WILLFUL CRUELTY","COMMITTING PUBLIC","DISTURBING THE PEAC","MAINTAINING A PUBLIC","FOR SALE","SALE OF","UNDER INFLUENCE","UNDER THE INFLUENCE","ATTEMPTED PETTY THEFT","GRAND THEFT PICK","GRAND THEFT PURSE","PETTY THEFT","THEFT, DRUNK ROLL,","AIDED CASE, DOG","AIDED CASE, INJURED","ASSAULT TO ROB WITH ","ATTEMPTED ROBBERY ON THE STREET","ATTEMPTED ROBBERY WITH","ROBBERY ON THE STREET ","ROBBERY, ARMED WITH A ","ROBBERY, BODILY","ASSAULT BY JUVENILE","SHOOTING BY JUVENILE","ANNOY OR MOLEST","STOLEN CELLULAR PHONE","STOLEN ELECTRONICS","SUSPICIOUS A","SUSPICIOUS OCCU","SUSPICIOUS PER","MISSING")
select_desc <- function(x) {
names(p %>% select(contains(x)))[-1]
}
dat_crimefinal <- full_join(dat_crimeNew %>% filter(Descript %in% unlist(lapply(final_desc,select_desc))),dat_crimeNew %>% filter(Category %in% c("KIDNAPPING","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE")))
## Joining by: c("Date", "Time", "Category", "Descript", "DayOfWeek", "PdDistrict", "Resolution", "Address", "X", "Y")
# Reduction in observations
dim(dat_crime)
## [1] 878049 9
dim(dat_crimeNew)
## [1] 509681 10
dim(dat_crimefinal)
## [1] 195711 10
# Based on the dat_occ_final we can categorise the level of serverity of crime if we want to show our results in such manner
dat_occ_final <- dat_crimefinal %>% group_by(Category,Descript) %>% summarize(occurence= n())
cat_life <- dat_crimefinal %>% filter(Category %in% c("ASSAULT","KIDNAPPING","MISSING PERSON","SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","SUSPICIOUS OCC"))
cat_prop <- dat_crimefinal %>% filter(Category %in% c("STOLEN PROPERTY","ROBBERY","LARCENY/THEFT"))
cat_nui <- dat_crimefinal %>% filter(Category %in% c("DISORDERLY CONDUCT","DRUG/NARCOTIC","DRUNKENNESS","NON-CRIMINAL"))
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.5
## Loading required package: ggplot2
x <- dat_crimefinal$X
y <- dat_crimefinal$Y
map_SF <- get_map(location = "San Francisco", zoom = 12)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=San+Francisco&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=San%20Francisco&sensor=false
map <- ggmap(map_SF)
#https://rpubs.com/nickbearman/r-google-map-making
#plot the roads Google Maps basemap
#map <- qmap('San Francisco', zoom = 15, maptype = 'roadmap')
#plot the density map
#map + stat_density2d(aes(x = x, y = y, fill = ..level.., alpha = ..level..), data = dat_crimefinal, geom = "polygon") + scale_fill_gradient(low = "blue", high = "red")
#http://www.r-bloggers.com/contour-and-density-layers-with-ggmap/
#Heat map using all data across all categories
W <- dat_crimefinal
ggmap(map_SF, extent = "panel", maprange=FALSE) +
geom_density2d(data = W, aes(x = x, y = y)) +
stat_density2d(data = W, aes(x = x, y = y, fill = ..level.., alpha = ..level..),
size = 0.01, bins = 16, geom = 'polygon') +
scale_fill_gradient(low = "blue", high = "red") +
theme(legend.position = "none")
## Warning: Removed 9 rows containing non-finite values (stat_density2d).
## Warning: Removed 9 rows containing non-finite values (stat_density2d).
#Heat map across category deemed as property
x1 <- cat_prop$X
y1 <- cat_prop$Y
ggmap(map_SF, extent = "panel", maprange=FALSE) +
geom_density2d(data = cat_prop, aes(x = x1, y = y1)) +
stat_density2d(data = cat_prop, aes(x = x1, y = y1, fill = ..level.., alpha = ..level..),
size = 0.01, bins = 16, geom = 'polygon') +
scale_fill_gradient(low = "green", high = "black") +
theme(legend.position = "none")
## Warning: Removed 3 rows containing non-finite values (stat_density2d).
## Warning: Removed 3 rows containing non-finite values (stat_density2d).
#Heat map across category deemed as life
x2 <- cat_life$X
y2 <- cat_life$Y
ggmap(map_SF, extent = "panel", maprange=FALSE) +
geom_density2d(data = cat_life, aes(x = x2, y = y2)) +
stat_density2d(data = cat_life, aes(x = x2, y = y2, fill = ..level.., alpha = ..level..),
size = 0.01, bins = 16, geom = 'polygon') +
scale_fill_gradient(low = "yellow", high = "blue") +
theme(legend.position = "none")
## Warning: Removed 4 rows containing non-finite values (stat_density2d).
## Warning: Removed 4 rows containing non-finite values (stat_density2d).
#Heat map across category deemed as nuisance
x3 <- cat_nui$X
y3 <- cat_nui$Y
ggmap(map_SF, extent = "panel", maprange=FALSE) +
geom_density2d(data = cat_nui, aes(x = x3, y = y3)) +
stat_density2d(data = cat_nui, aes(x = x3, y = y3, fill = ..level.., alpha = ..level..),
size = 0.01, bins = 16, geom = 'polygon') +
scale_fill_gradient(low = "yellow", high = "red") +
theme(legend.position = "none")
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
#Scatter plots
#https://rpubs.com/hegupta/151080
dat <- dat_crimefinal %>% mutate(class = ifelse(Category %in% c("STOLEN PROPERTY","ROBBERY","LARCENY/THEFT"), "Prop", ifelse(Category %in% c("DISORDERLY CONDUCT","DRUG/NARCOTIC","DRUNKENNESS","NON-CRIMINAL"), "Nui", "Life")))
dat1 <- dat %>% group_by(Category) %>% summarize(occurence = n())
#Histogram by Category
qplot(Category, data = dat, geom = "bar", fill = Category) +
ggtitle("Crime Categories in San Francisco") +
xlab("Category") +
ylab("Occurence")
qplot(dat$PdDistrict, data = dat, geom = "bar", fill = class) +
scale_x_discrete(label = abbreviate) +
ggtitle("Crime by district and class in San Francisco") +
xlab("District") +
ylab("Crimes by class")
#Further analysis of Southern District
#Daily average crime by class in southern district
dat2 <- dat %>% filter(PdDistrict == "SOUTHERN") %>% group_by(DayOfWeek, class) %>% summarize(DailyAvg = mean(n()))
ggplot(dat2, aes(x = dat2$DayOfWeek, y = dat2$DailyAvg, fill = factor(dat2$class) )) + geom_bar(stat="identity") + labs(x = "Day of Week", y = "Daily Average", title = "Daily Avg Crime by class for Southern District")
#Time of Day analysis for Southern District
dat3 <- dat %>% filter(PdDistrict=="SOUTHERN")
time <- as.data.frame(table(dat3$Time),stringsAsFactors = FALSE)
time$Hour <- substr(time$Var1, 1, 2)
TimeOfDay <- ggplot(time, aes(x = Hour, Freq))
TimeOfDay + stat_summary(fun.y = sum, geom = "bar", fill = "grey") +
labs(x = "Time of Day", y = "Occurrences", title ="Southern District crime timings")